suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
library(survival)
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
setwd("~/Google Drive/My Drive/Analysis/METTL2A/")

devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Directory to save figures 
fig_expression <- 'Figures/TCGA_GTEx/Expression/'
fig_survival   <- 'Figures/TCGA_GTEx/Survival/'
tabledir <- 'Tables/TCGA_GTEx/'

Sys.setenv("VROOM_CONNECTION_SIZE" = 1e+06)

determine_sample_type <- function(df) {
  
  df |>
    mutate(
      sample_type = case_when(
        grepl('Tumor|Primary|Cancer|Metastatic' , `_sample_type`) ~ 'Tumor',
        grepl('Normal', `_sample_type`) ~ 'Normal',
        .default = NA
      )
    )
  
}

fill_primary_site_info <- function(df) {
  
  df |>
    mutate(
      primary_site = ifelse(
        !is.na(primary_site),
        yes = primary_site,
        no = str_extract(detailed_category, '^([A-Za-z]+)')
      )
    ) |>
    mutate(
      primary_site = str_remove(primary_site, '\xca')
    )
  
}

convert_primarysite_name <- function(df) {
  
  df |>
    mutate(
      primary_site_modified = str_replace_all(
        primary_site, 'Adrenal gland', 'Adrenal Gland'
      )
    ) |>
    mutate(
      primary_site_modified = str_replace_all(
        primary_site_modified, 'Blood Vessel|White blood cell', 'Blood'
      )
    )
  
  
}

plot_KM <- function(.genename, df, category) {
  
  filtered_df <- df |>
    dplyr::filter(gene_name == .genename) |>
    dplyr::filter(primary_site_modified == category) |>
    mutate(
      group = ifelse(norm_count > median(norm_count), 'high', 'low')
    )
  
  genename_fit <- surv_fit(
    Surv(OS.time, OS) ~ group,
    data = filtered_df
  )
  print(.genename)
  print(summary(genename_fit)$table)
  
  survivalplot <-
    ggsurvplot(
      genename_fit, pval = TRUE, pval.size = 3,
      title = paste0(.genename, '\n', category),
      palette = c('red', 'blue'),
      ggtheme = theme_survminer(base_size = 8),
      legend = 'bottom', legend.title = '', 
      censor.size = .1
    )
  survivalplot[[1]] |>
    ggsave_multiple_formats(
      outdir = fig_survival,
      basename = paste0(.genename, '_', category),
      width = 2.6, height = 4.2, fontsize = 7)
  
}

Read data

Sample information

TCGAtargetGTEx_sampleinfo <-
  read_TCGAtargetGTEx_sampleinfo() |>
  determine_sample_type() |>
  rename(primary_site = `_primary_site`) |>
  fill_primary_site_info() |>
  convert_primarysite_name()
TCGAtargetGTEx_sampleinfo
## # A tibble: 19,131 × 9
##    sample   detailed_category primary disease or t…¹ primary_site `_sample_type`
##    <chr>    <chr>             <chr>                  <chr>        <chr>         
##  1 TCGA-V4… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  2 TCGA-VD… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  3 TCGA-V4… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  4 TCGA-VD… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  5 TCGA-WC… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  6 TCGA-WC… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  7 TCGA-WC… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  8 TCGA-YZ… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
##  9 TCGA-V4… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
## 10 TCGA-WC… Uveal Melanoma    Uveal Melanoma         Eye          Primary Tumor 
## # ℹ 19,121 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 4 more variables: `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>
TCGAtargetGTEx_sampleinfo |>
  filter(primary_site == 'Uterus')
## # A tibble: 135 × 9
##    sample   detailed_category primary disease or t…¹ primary_site `_sample_type`
##    <chr>    <chr>             <chr>                  <chr>        <chr>         
##  1 TCGA-ND… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  2 TCGA-NF… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  3 TCGA-N8… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  4 TCGA-N7… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  5 TCGA-N6… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  6 TCGA-N6… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  7 TCGA-ND… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  8 TCGA-N8… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
##  9 TCGA-N6… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
## 10 TCGA-NA… Uterine Carcinos… Uterine Carcinosarcoma Uterus       Primary Tumor 
## # ℹ 125 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 4 more variables: `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>
TCGAtargetGTEx_sampleinfo |>
  filter(grepl('Uterine', detailed_category))
## # A tibble: 261 × 9
##    sample   detailed_category primary disease or t…¹ primary_site `_sample_type`
##    <chr>    <chr>             <chr>                  <chr>        <chr>         
##  1 TCGA-AJ… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  2 TCGA-BG… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  3 TCGA-AX… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  4 TCGA-DI… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  5 TCGA-AJ… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  6 TCGA-BG… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  7 TCGA-BG… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  8 TCGA-AX… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
##  9 TCGA-AX… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
## 10 TCGA-AX… Uterine Corpus E… Uterine Corpus Endome… Endometrium  Solid Tissue …
## # ℹ 251 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 4 more variables: `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>

Read survival data

TCGA_survival <-
  read_TCGA_survival()
TCGA_survival
## # A tibble: 10,496 × 9
##    sample             OS OS.time   DSS DSS.time   DFI DFI.time   PFI PFI.time
##    <chr>           <dbl>   <dbl> <dbl>    <dbl> <dbl>    <dbl> <dbl>    <dbl>
##  1 TCGA-OR-A5J1-01     1    1355     1     1355     1      754     1      754
##  2 TCGA-OR-A5J2-01     1    1677     1     1677    NA       NA     1      289
##  3 TCGA-OR-A5J3-01     0    2091     0     2091     1       53     1       53
##  4 TCGA-OR-A5J5-01     1     365     1      365    NA       NA     1       50
##  5 TCGA-OR-A5J6-01     0    2703     0     2703     0     2703     0     2703
##  6 TCGA-OR-A5J7-01     1     490     1      490    NA       NA     1      162
##  7 TCGA-OR-A5J8-01     1     579     1      579     1      530     1      530
##  8 TCGA-OR-A5J9-01     0    1352     0     1352     1      414     1      414
##  9 TCGA-OR-A5JA-01     1     922     1      922    NA       NA     1      922
## 10 TCGA-OR-A5JB-01     1     551    NA      551    NA       NA     0      551
## # ℹ 10,486 more rows

Read m3C writer expression

TCGAtargetGTEx_norm_genecounts <-
  read_TCGAtargetGTEx_norm.genecount_deseq2()
## New names:
## • `` -> `...1`
TCGAtargetGTEx_norm_genecounts
## # A tibble: 60,498 × 19,041
##    gene_id       gene_name `TCGA-AD-5900-01` `TCGA-BP-4968-01` `TCGA-NG-A4VU-01`
##    <chr>         <chr>                 <dbl>             <dbl>             <dbl>
##  1 ENSG00000242… RP11-368…              0                 1.81              0   
##  2 ENSG00000259… RP11-167…              0                 0                 0   
##  3 ENSG00000270… RP11-742…              0                 2.37              4.04
##  4 ENSG00000167… RAB4B                  9.39             10.9               8.35
##  5 ENSG00000278… AC104071…              0                 0                 0   
##  6 ENSG00000078… TIGAR                  8.44              9.10              8.36
##  7 ENSG00000269… LINC01224              7.08              3.09              6.32
##  8 ENSG00000263… MIR4802                0                 0                 0   
##  9 ENSG00000146… RNF44                 11.2              11.9              11.5 
## 10 ENSG00000158… DNAH3                  5.47              2.77              3.70
## # ℹ 60,488 more rows
## # ℹ 19,036 more variables: `TCGA-CG-4305-01` <dbl>, `TCGA-AO-A03M-01` <dbl>,
## #   `TCGA-ZH-A8Y6-01` <dbl>, `TCGA-HT-7686-01` <dbl>, `TCGA-BR-6458-11` <dbl>,
## #   `TCGA-29-1699-01` <dbl>, `TCGA-KK-A6E1-01` <dbl>, `TCGA-DX-A7EO-01` <dbl>,
## #   `TCGA-AO-A0JD-01` <dbl>, `TCGA-TM-A84Q-01` <dbl>, `TCGA-SR-A6MQ-01` <dbl>,
## #   `TCGA-CV-7103-01` <dbl>, `TCGA-EL-A4KD-01` <dbl>, `TCGA-CJ-4890-01` <dbl>,
## #   `TCGA-EL-A3ZH-01` <dbl>, `TCGA-A2-A0CT-01` <dbl>, …
TCGAtargetGTEx_gene_TPMs <-
  read_TCGAtargetGTEx_rsem_gene_TPM()
TCGAtargetGTEx_gene_TPMs
## # A tibble: 60,498 × 19,133
##    gene_id  gene_name GTEX-S4Q7-0003-SM-3N…¹ `TCGA-19-1787-01` `TCGA-S9-A7J2-01`
##    <chr>    <chr>                      <dbl>             <dbl>             <dbl>
##  1 ENSG000… RP11-368…                  -3.46            -9.97              0.300
##  2 ENSG000… RP11-167…                  -9.97            -9.97             -9.97 
##  3 ENSG000… RP11-742…                  -3.63            -3.82             -3.05 
##  4 ENSG000… RAB4B                       4.60             5.30              4.89 
##  5 ENSG000… AC104071…                  -9.97            -9.97             -9.97 
##  6 ENSG000… TIGAR                       2.26             3.51              2.30 
##  7 ENSG000… LINC01224                  -6.51             0.865            -1.03 
##  8 ENSG000… MIR4802                    -9.97            -9.97             -9.97 
##  9 ENSG000… RNF44                       5.78             4.25              5.38 
## 10 ENSG000… DNAH3                      -4.29            -5.01             -9.97 
## # ℹ 60,488 more rows
## # ℹ abbreviated name: ¹​`GTEX-S4Q7-0003-SM-3NM8M`
## # ℹ 19,128 more variables: `GTEX-QV31-1626-SM-2S1QC` <dbl>,
## #   `TCGA-G3-A3CH-11` <dbl>, `TCGA-B5-A5OE-01` <dbl>,
## #   `GTEX-13QIC-0011-R1a-SM-5O9CJ` <dbl>, `TCGA-B2-5641-11` <dbl>,
## #   `GTEX-ZPCL-0126-SM-4WWC8` <dbl>, `TARGET-20-PANGDN-09` <dbl>,
## #   `GTEX-S33H-1226-SM-4AD69` <dbl>, `GTEX-X88G-0426-SM-47JZ5` <dbl>, …
m3C_writers_TCGAtargetGTEx_norm_genecounts <-
  TCGAtargetGTEx_norm_genecounts |>
  filter(grepl('^METTL[2|6|8][AB]?$', gene_name)) |>
  pivot_longer(
    cols = -c(gene_id, gene_name),
    names_to = 'sample', values_to = 'norm_count'
  ) |>
  left_join(TCGAtargetGTEx_sampleinfo) |>
  filter(!is.na(sample_type))
## Joining with `by = join_by(sample)`
m3C_writers_TCGAtargetGTEx_norm_genecounts
## # A tibble: 74,700 × 12
##    gene_id  gene_name sample norm_count detailed_category primary disease or t…¹
##    <chr>    <chr>     <chr>       <dbl> <chr>             <chr>                 
##  1 ENSG000… METTL8    TCGA-…       9.55 Colon Adenocarci… Colon Adenocarcinoma  
##  2 ENSG000… METTL8    TCGA-…       9.45 Kidney Clear Cel… Kidney Clear Cell Car…
##  3 ENSG000… METTL8    TCGA-…      10.4  Uterine Carcinos… Uterine Carcinosarcoma
##  4 ENSG000… METTL8    TCGA-…      10.1  Stomach Adenocar… Stomach Adenocarcinoma
##  5 ENSG000… METTL8    TCGA-…       9.49 Breast Invasive … Breast Invasive Carci…
##  6 ENSG000… METTL8    TCGA-…       9.23 Cholangiocarcino… Cholangiocarcinoma    
##  7 ENSG000… METTL8    TCGA-…       9.48 Brain Lower Grad… Brain Lower Grade Gli…
##  8 ENSG000… METTL8    TCGA-…       9.34 Stomach Adenocar… Stomach Adenocarcinoma
##  9 ENSG000… METTL8    TCGA-…       9.55 Ovarian Serous C… Ovarian Serous Cystad…
## 10 ENSG000… METTL8    TCGA-…      10.6  Prostate Adenoca… Prostate Adenocarcino…
## # ℹ 74,690 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 6 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>
m3C_writers_TCGAtargetGTEx_gene_TPMs <-
  TCGAtargetGTEx_gene_TPMs |>
  filter(grepl('^METTL[2|6|8][AB]?$', gene_name)) |>
  pivot_longer(
    cols = -c(gene_id, gene_name),
    names_to = 'sample', values_to = 'TPM'
  ) |>
  left_join(TCGAtargetGTEx_sampleinfo) |>
  filter(!is.na(sample_type))
## Joining with `by = join_by(sample)`
m3C_writers_TCGAtargetGTEx_gene_TPMs
## # A tibble: 74,788 × 12
##    gene_id       gene_name sample   TPM detailed_category primary disease or t…¹
##    <chr>         <chr>     <chr>  <dbl> <chr>             <chr>                 
##  1 ENSG00000123… METTL8    TCGA-… 3.25  Glioblastoma Mul… Glioblastoma Multifor…
##  2 ENSG00000123… METTL8    TCGA-… 2.52  Brain Lower Grad… Brain Lower Grade Gli…
##  3 ENSG00000123… METTL8    GTEX-… 3.09  Artery - Tibial   Artery - Tibial       
##  4 ENSG00000123… METTL8    TCGA-… 0.896 Liver Hepatocell… Liver Hepatocellular …
##  5 ENSG00000123… METTL8    TCGA-… 4.38  Uterine Corpus E… Uterine Corpus Endome…
##  6 ENSG00000123… METTL8    GTEX-… 2.06  Brain - Hippocam… Brain - Hippocampus   
##  7 ENSG00000123… METTL8    TCGA-… 2.32  Kidney Clear Cel… Kidney Clear Cell Car…
##  8 ENSG00000123… METTL8    GTEX-… 1.90  Thyroid           Thyroid               
##  9 ENSG00000123… METTL8    TARGE… 1.64  Acute Myeloid Le… Acute Myeloid Leukemia
## 10 ENSG00000123… METTL8    GTEX-… 0.178 Pancreas          Pancreas              
## # ℹ 74,778 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 6 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>

Plot distribution

m3C_writers_TCGAtargetGTEx_norm_genecounts |>
  ggplot(aes(
    x = primary_site_modified, y = norm_count,
    colour = sample_type, fill = sample_type)
  ) +
  geom_violin() +
  scale_y_log10() +
  coord_flip() +
  facet_wrap( ~ gene_name)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 114 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

Calculate mean expression in normal tissues

median_normcount_in_normal <-
  m3C_writers_TCGAtargetGTEx_norm_genecounts |>
  filter(sample_type == 'Normal') |>
  group_by(gene_name, primary_site_modified) |>
  reframe(median_in_normal = median(norm_count, na.rm = TRUE))
median_normcount_in_normal
## # A tibble: 152 × 3
##    gene_name primary_site_modified median_in_normal
##    <chr>     <chr>                            <dbl>
##  1 METTL2A   Adipose Tissue                    7.83
##  2 METTL2A   Adrenal Gland                     8.38
##  3 METTL2A   Bile duct                         8.87
##  4 METTL2A   Bladder                           9.24
##  5 METTL2A   Blood                             7.75
##  6 METTL2A   Brain                             8.27
##  7 METTL2A   Breast                            8.12
##  8 METTL2A   Cervix                            9.75
##  9 METTL2A   Cervix Uteri                      7.66
## 10 METTL2A   Colon                             8.14
## # ℹ 142 more rows
# mean_TPM_in_normal <-
#   m3C_writers_TCGAtargetGTEx_gene_TPMs |>
#   filter(sample_type == 'Normal') |>
#   group_by(gene_name, primary_site_modified) |>
#   reframe(mean_in_normal = mean(TPM, na.rm = TRUE))
# mean_TPM_in_normal

Calculate relative expression

m3C_writers_TCGAtargetGTEx_rel_expression_normcount <-
  m3C_writers_TCGAtargetGTEx_norm_genecounts |>
  left_join(median_normcount_in_normal) |>
  mutate(rel_expression = norm_count - median_in_normal)
## Joining with `by = join_by(gene_name, primary_site_modified)`
m3C_writers_TCGAtargetGTEx_rel_expression_normcount
## # A tibble: 74,700 × 14
##    gene_id  gene_name sample norm_count detailed_category primary disease or t…¹
##    <chr>    <chr>     <chr>       <dbl> <chr>             <chr>                 
##  1 ENSG000… METTL8    TCGA-…       9.55 Colon Adenocarci… Colon Adenocarcinoma  
##  2 ENSG000… METTL8    TCGA-…       9.45 Kidney Clear Cel… Kidney Clear Cell Car…
##  3 ENSG000… METTL8    TCGA-…      10.4  Uterine Carcinos… Uterine Carcinosarcoma
##  4 ENSG000… METTL8    TCGA-…      10.1  Stomach Adenocar… Stomach Adenocarcinoma
##  5 ENSG000… METTL8    TCGA-…       9.49 Breast Invasive … Breast Invasive Carci…
##  6 ENSG000… METTL8    TCGA-…       9.23 Cholangiocarcino… Cholangiocarcinoma    
##  7 ENSG000… METTL8    TCGA-…       9.48 Brain Lower Grad… Brain Lower Grade Gli…
##  8 ENSG000… METTL8    TCGA-…       9.34 Stomach Adenocar… Stomach Adenocarcinoma
##  9 ENSG000… METTL8    TCGA-…       9.55 Ovarian Serous C… Ovarian Serous Cystad…
## 10 ENSG000… METTL8    TCGA-…      10.6  Prostate Adenoca… Prostate Adenocarcino…
## # ℹ 74,690 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 8 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>, median_in_normal <dbl>, rel_expression <dbl>
# m3C_writers_TCGAtargetGTEx_rel_expression_TPM <-
#   m3C_writers_TCGAtargetGTEx_gene_TPMs |>
#   left_join(mean_TPM_in_normal) |>
#   mutate(rel_expression = TPM / mean_in_normal)
# m3C_writers_TCGAtargetGTEx_rel_expression_TPM

Plot heatmap

m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary <- 
  m3C_writers_TCGAtargetGTEx_rel_expression_normcount |> 
  filter(gene_name != 'ALKBH1') |> 
  filter(sample_type == 'Tumor') |>
  group_by(gene_name, sample_type, primary_site_modified) |> 
  reframe(
    n = n(),
    mean = mean(rel_expression),
    median = median(rel_expression)
  ) |> 
  arrange(-mean)
m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: Tables/TCGA_GTEx/m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary_2024-07-29.tsv
## # A tibble: 120 × 6
##    gene_name sample_type primary_site_modified     n  mean median
##    <chr>     <chr>       <chr>                 <int> <dbl>  <dbl>
##  1 METTL2A   Tumor       Ovary                   427  2.47   2.43
##  2 METTL2A   Tumor       Uterus                   57  2.39   2.40
##  3 METTL2A   Tumor       Lung                   1013  2.35   2.31
##  4 METTL2B   Tumor       Testis                  154  2.30   2.32
##  5 METTL8    Tumor       Stomach                 413  2.15   2.21
##  6 METTL2B   Tumor       Uterus                   57  2.00   2.04
##  7 METTL8    Tumor       Pancreas                179  1.96   1.95
##  8 METTL2B   Tumor       Ovary                   427  1.94   1.99
##  9 METTL2A   Tumor       Breast                 1099  1.94   1.81
## 10 METTL8    Tumor       Lung                   1013  1.93   1.89
## # ℹ 110 more rows
m3C_relexpression_in_tumor_heatmap <- 
  m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary |> 
  filter(!is.na(median)) |> 
  ggplot(aes(
    x = gene_name |> str_remove('METTL'), 
    y = reorder(primary_site_modified, mean), 
    fill = mean
  )) +
  geom_tile() +
  scale_fill_gradient2(
    low = 'blue', mid = 'white', high = 'red', midpoint = 0, 
    limits = c(-2.5, 2.5)
  ) +
  labs(x = '', y = '') +
  theme_minimal(base_size = 8) +
  theme(
    legend.position = 'bottom', 
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
m3C_relexpression_in_tumor_heatmap |> 
  ggsave_multiple_formats(
    outdir = fig_expression, 
    width = 4.5, height = 9, units = 'cm', fontsize = 7
  )

Plot distribution of relative expression

m3C_writers_TCGAtargetGTEx_rel_expression_normcount |>
  filter(gene_name != 'ALKBH1') |> 
  filter(sample_type == 'Tumor') |>
  filter(!is.na(rel_expression)) |>
  ggplot(aes(
    x = reorder(primary_site_modified, rel_expression),
    y = rel_expression)) +
  geom_violin() +
  geom_hline(yintercept = 1) +
  scale_y_log10(limits = c(0.4, 2.2), breaks = c(0.5, 0.8, 1, 1.25, 2)) +
  labs(x = '', y = 'relative expression (normal = 1)') +
  coord_flip() +
  facet_wrap( ~ gene_name)
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(limits = c(0.4, 2.2), breaks = c(0.5, 0.8, 1, 1.25, :
## log-10 transformation introduced infinite values.
## Warning: Removed 14741 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

m3C_writers_TCGAtargetGTEx_rel_expression_normcount |>
  filter(gene_name != 'ALKBH1') |> 
  filter(sample_type == 'Tumor') |>
  ggplot(aes(
    x = reorder(primary_site_modified, rel_expression, median),
    y = rel_expression)) +
  geom_violin() +
  geom_hline(yintercept = 1) +
  scale_y_log10() +
  coord_flip() +
  facet_wrap( ~ gene_name)
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 6898 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

test

m3C_relexpression_in_tumor_pvalues <- 
  m3C_writers_TCGAtargetGTEx_rel_expression_normcount |> 
  filter(gene_name != 'ALKBH1') |> 
  right_join(
    m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary |> 
      select(primary_site_modified) |> 
      distinct()
  ) |> 
  filter(!is.na(rel_expression)) |> 
  group_by(gene_name, primary_site_modified) |>
  rstatix::wilcox_test(rel_expression ~ sample_type, ref.group = 'Normal') |> 
  left_join(m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary) |>
  filter(!is.na(mean))
## Joining with `by = join_by(primary_site_modified)`
## Joining with `by = join_by(gene_name, primary_site_modified)`
m3C_relexpression_in_tumor_pvalues 
## # A tibble: 104 × 13
##    gene_name primary_site_modified .y.       group1 group2    n1    n2 statistic
##    <chr>     <chr>                 <chr>     <chr>  <chr>  <int> <int>     <dbl>
##  1 METTL2A   Adrenal Gland         rel_expr… Normal Tumor    126    77     1394 
##  2 METTL2A   Bile duct             rel_expr… Normal Tumor      9    36        3 
##  3 METTL2A   Bladder               rel_expr… Normal Tumor     28   407     2066 
##  4 METTL2A   Blood                 rel_expr… Normal Tumor    941   595    71280 
##  5 METTL2A   Brain                 rel_expr… Normal Tumor   1153   689     2586.
##  6 METTL2A   Breast                rel_expr… Normal Tumor    291  1099    22319 
##  7 METTL2A   Cervix                rel_expr… Normal Tumor      3   305      249 
##  8 METTL2A   Colon                 rel_expr… Normal Tumor    348   289     5250.
##  9 METTL2A   Endometrium           rel_expr… Normal Tumor     23   181      707 
## 10 METTL2A   Esophagus             rel_expr… Normal Tumor    667   182     1207 
## # ℹ 94 more rows
## # ℹ 5 more variables: p <dbl>, sample_type <chr>, n <int>, mean <dbl>,
## #   median <dbl>
m3C_relexpression_in_tumor_pvalues |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: Tables/TCGA_GTEx/m3C_relexpression_in_tumor_pvalues_2024-07-29.tsv
## # A tibble: 104 × 13
##    gene_name primary_site_modified .y.       group1 group2    n1    n2 statistic
##    <chr>     <chr>                 <chr>     <chr>  <chr>  <int> <int>     <dbl>
##  1 METTL2A   Adrenal Gland         rel_expr… Normal Tumor    126    77     1394 
##  2 METTL2A   Bile duct             rel_expr… Normal Tumor      9    36        3 
##  3 METTL2A   Bladder               rel_expr… Normal Tumor     28   407     2066 
##  4 METTL2A   Blood                 rel_expr… Normal Tumor    941   595    71280 
##  5 METTL2A   Brain                 rel_expr… Normal Tumor   1153   689     2586.
##  6 METTL2A   Breast                rel_expr… Normal Tumor    291  1099    22319 
##  7 METTL2A   Cervix                rel_expr… Normal Tumor      3   305      249 
##  8 METTL2A   Colon                 rel_expr… Normal Tumor    348   289     5250.
##  9 METTL2A   Endometrium           rel_expr… Normal Tumor     23   181      707 
## 10 METTL2A   Esophagus             rel_expr… Normal Tumor    667   182     1207 
## # ℹ 94 more rows
## # ℹ 5 more variables: p <dbl>, sample_type <chr>, n <int>, mean <dbl>,
## #   median <dbl>
m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary |> 
  arrange(median)
## # A tibble: 120 × 6
##    gene_name sample_type primary_site_modified     n    mean  median
##    <chr>     <chr>       <chr>                 <int>   <dbl>   <dbl>
##  1 METTL8    Tumor       Paraganglia             182 -0.451  -0.423 
##  2 METTL8    Tumor       Soft tissue,Bone        262 -0.385  -0.343 
##  3 METTL6    Tumor       Testis                  154 -0.275  -0.284 
##  4 METTL2A   Tumor       Paraganglia             182 -0.256  -0.225 
##  5 METTL8    Tumor       Endometrium             181 -0.109  -0.182 
##  6 METTL2B   Tumor       Cervix                  305 -0.182  -0.173 
##  7 METTL2B   Tumor       Head and Neck region    520 -0.169  -0.169 
##  8 METTL8    Tumor       Rectum                   93 -0.174  -0.158 
##  9 METTL8    Tumor       Thyroid Gland           512 -0.169  -0.133 
## 10 METTL2A   Tumor       Rectum                   93 -0.0836 -0.0511
## # ℹ 110 more rows
m3C_relexpression_in_tumor_pvalues_heatmap <- 
  m3C_relexpression_in_tumor_pvalues  |> 
  mutate(
    minuslog10p = -log10(p),
    group = case_when(
      p < .05 & mean > 0 ~ 'A',
      p < .05 & mean < 0 ~ 'B',
      .default = 'C'
    )
  ) |> 
  ggplot(aes(
    x = gene_name |> str_remove('METTL'),
    y = reorder(primary_site_modified, mean),
    fill = group
  )) +
  geom_tile() +
  labs(x = '', y = '') +
  scale_fill_manual(values = c('red', 'blue', 'gray')) +
  theme_minimal(base_size = 8) +
  theme(
    legend.position = 'bottom', 
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
m3C_relexpression_in_tumor_pvalues_heatmap |> 
  ggsave_multiple_formats(
    outdir = fig_expression, 
    width = 4.5, height = 9, units = 'cm', fontsize = 7
  )

# m3C_writers_TCGAtargetGTEx_rel_expression_TPM |>
#   filter(sample_type == 'Tumor') |>
#   ggplot(aes(
#     x = reorder(primary_site_modified, rel_expression),
#     y = rel_expression)) +
#   geom_violin() +
#   geom_hline(yintercept = 1) +
#   scale_y_log10() +
#   coord_flip() +
#   facet_wrap( ~ gene_name)

Survival analysis

# m3C_writers_TCGA_gene_TPMs_survival <-
#   m3C_writers_TCGAtargetGTEx_gene_TPMs |>
#   right_join(TCGA_survival)
# m3C_writers_TCGA_gene_TPMs_survival

m3C_writers_TCGA_norm_genecounts_survival <-
  m3C_writers_TCGAtargetGTEx_norm_genecounts |>
  right_join(TCGA_survival)
## Joining with `by = join_by(sample)`
m3C_writers_TCGA_norm_genecounts_survival
## # A tibble: 41,969 × 20
##    gene_id  gene_name sample norm_count detailed_category primary disease or t…¹
##    <chr>    <chr>     <chr>       <dbl> <chr>             <chr>                 
##  1 ENSG000… METTL8    TCGA-…       9.55 Colon Adenocarci… Colon Adenocarcinoma  
##  2 ENSG000… METTL8    TCGA-…       9.45 Kidney Clear Cel… Kidney Clear Cell Car…
##  3 ENSG000… METTL8    TCGA-…      10.4  Uterine Carcinos… Uterine Carcinosarcoma
##  4 ENSG000… METTL8    TCGA-…      10.1  Stomach Adenocar… Stomach Adenocarcinoma
##  5 ENSG000… METTL8    TCGA-…       9.49 Breast Invasive … Breast Invasive Carci…
##  6 ENSG000… METTL8    TCGA-…       9.23 Cholangiocarcino… Cholangiocarcinoma    
##  7 ENSG000… METTL8    TCGA-…       9.48 Brain Lower Grad… Brain Lower Grade Gli…
##  8 ENSG000… METTL8    TCGA-…       9.34 Stomach Adenocar… Stomach Adenocarcinoma
##  9 ENSG000… METTL8    TCGA-…       9.55 Ovarian Serous C… Ovarian Serous Cystad…
## 10 ENSG000… METTL8    TCGA-…      10.6  Prostate Adenoca… Prostate Adenocarcino…
## # ℹ 41,959 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 14 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>, OS <dbl>, OS.time <dbl>, DSS <dbl>,
## #   DSS.time <dbl>, DFI <dbl>, DFI.time <dbl>, PFI <dbl>, PFI.time <dbl>
m3C_writers_TCGA_norm_genecounts_survival |> 
  export_tsv(outdir = tabledir, compression = 'gz')
## 
## Exported to: Tables/TCGA_GTEx/m3C_writers_TCGA_norm_genecounts_survival_2024-07-29.tsv.gz
## # A tibble: 41,969 × 20
##    gene_id  gene_name sample norm_count detailed_category primary disease or t…¹
##    <chr>    <chr>     <chr>       <dbl> <chr>             <chr>                 
##  1 ENSG000… METTL8    TCGA-…       9.55 Colon Adenocarci… Colon Adenocarcinoma  
##  2 ENSG000… METTL8    TCGA-…       9.45 Kidney Clear Cel… Kidney Clear Cell Car…
##  3 ENSG000… METTL8    TCGA-…      10.4  Uterine Carcinos… Uterine Carcinosarcoma
##  4 ENSG000… METTL8    TCGA-…      10.1  Stomach Adenocar… Stomach Adenocarcinoma
##  5 ENSG000… METTL8    TCGA-…       9.49 Breast Invasive … Breast Invasive Carci…
##  6 ENSG000… METTL8    TCGA-…       9.23 Cholangiocarcino… Cholangiocarcinoma    
##  7 ENSG000… METTL8    TCGA-…       9.48 Brain Lower Grad… Brain Lower Grade Gli…
##  8 ENSG000… METTL8    TCGA-…       9.34 Stomach Adenocar… Stomach Adenocarcinoma
##  9 ENSG000… METTL8    TCGA-…       9.55 Ovarian Serous C… Ovarian Serous Cystad…
## 10 ENSG000… METTL8    TCGA-…      10.6  Prostate Adenoca… Prostate Adenocarcino…
## # ℹ 41,959 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 14 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>, OS <dbl>, OS.time <dbl>, DSS <dbl>,
## #   DSS.time <dbl>, DFI <dbl>, DFI.time <dbl>, PFI <dbl>, PFI.time <dbl>
temp <- 
  m3C_writers_TCGA_norm_genecounts_survival |> 
  filter(gene_name == 'METTL2A') |>
  filter(primary_site_modified == 'Pancreas') |> 
  #group_by(primary_site_modified) |> 
  mutate(
    group = ifelse(norm_count > median(norm_count), 'high', 'low')
  )
temp
## # A tibble: 183 × 21
##    gene_id  gene_name sample norm_count detailed_category primary disease or t…¹
##    <chr>    <chr>     <chr>       <dbl> <chr>             <chr>                 
##  1 ENSG000… METTL2A   TCGA-…       9.68 Pancreatic Adeno… Pancreatic Adenocarci…
##  2 ENSG000… METTL2A   TCGA-…       9.5  Pancreatic Adeno… Pancreatic Adenocarci…
##  3 ENSG000… METTL2A   TCGA-…       9.33 Pancreatic Adeno… Pancreatic Adenocarci…
##  4 ENSG000… METTL2A   TCGA-…       9.64 Pancreatic Adeno… Pancreatic Adenocarci…
##  5 ENSG000… METTL2A   TCGA-…       9.08 Pancreatic Adeno… Pancreatic Adenocarci…
##  6 ENSG000… METTL2A   TCGA-…       9.73 Pancreatic Adeno… Pancreatic Adenocarci…
##  7 ENSG000… METTL2A   TCGA-…      10.4  Pancreatic Adeno… Pancreatic Adenocarci…
##  8 ENSG000… METTL2A   TCGA-…       9.49 Pancreatic Adeno… Pancreatic Adenocarci…
##  9 ENSG000… METTL2A   TCGA-…      10.1  Pancreatic Adeno… Pancreatic Adenocarci…
## 10 ENSG000… METTL2A   TCGA-…       9.55 Pancreatic Adeno… Pancreatic Adenocarci…
## # ℹ 173 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 15 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>, OS <dbl>, OS.time <dbl>, DSS <dbl>,
## #   DSS.time <dbl>, DFI <dbl>, DFI.time <dbl>, PFI <dbl>, PFI.time <dbl>,
## #   group <chr>
do_coxph_2group <- function(df) {
  
  df <- df |>
    mutate(
      group = ifelse(norm_count > median(norm_count), 'high', 'low')
    )
  
  coxph(Surv(OS.time, OS) ~ group, df) |>
    broom::tidy()
  
}
temp |> 
  do_coxph_2group()
## # A tibble: 1 × 5
##   term     estimate std.error statistic p.value
##   <chr>       <dbl>     <dbl>     <dbl>   <dbl>
## 1 grouplow   -0.448     0.211     -2.12  0.0336
m3C_writers_TCGA_norm_genecounts_survival_coxph <- 
  m3C_writers_TCGA_norm_genecounts_survival |> 
  filter(!is.na(primary_site_modified)) |>
  group_by(gene_name, primary_site_modified) |> 
  nest() |> 
  mutate(model = map(data, do_coxph_2group)) |> 
  select(-data) |> 
  unnest(cols = model) |> 
  mutate(group = case_when(
    estimate > 0 & p.value < .05 ~ 'favorable in high',
    estimate < 0 & p.value < .05 ~ 'unfavorable in high',
    .default = 'not significant'
  )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `model = map(data, do_coxph_2group)`.
## ℹ In group 55: `gene_name = "METTL2B"` and `primary_site_modified = "Testis"`.
## Caused by warning in `coxph.fit()`:
## ! Loglik converged before variable  1 ; coefficient may be infinite.
m3C_writers_TCGA_norm_genecounts_survival_coxph
## # A tibble: 116 × 8
## # Groups:   gene_name, primary_site_modified [116]
##    gene_name primary_site_modified term     estimate std.error statistic p.value
##    <chr>     <chr>                 <chr>       <dbl>     <dbl>     <dbl>   <dbl>
##  1 METTL8    Colon                 grouplow  0.00905     0.222    0.0407 9.68e-1
##  2 METTL8    Kidney                grouplow -0.589       0.128   -4.61   3.94e-6
##  3 METTL8    Uterus                grouplow  0.764       0.351    2.18   2.96e-2
##  4 METTL8    Stomach               grouplow -0.185       0.155   -1.20   2.32e-1
##  5 METTL8    Breast                grouplow  0.112       0.143    0.785  4.32e-1
##  6 METTL8    Bile duct             grouplow  0.122       0.427    0.285  7.75e-1
##  7 METTL8    Brain                 grouplow -0.418       0.125   -3.35   8.23e-4
##  8 METTL8    Ovary                 grouplow  0.335       0.124    2.71   6.71e-3
##  9 METTL8    Prostate              grouplow  0.343       0.648    0.530  5.96e-1
## 10 METTL8    Soft tissue,Bone      grouplow  0.538       0.205    2.63   8.62e-3
## # ℹ 106 more rows
## # ℹ 1 more variable: group <chr>
m3C_writers_TCGA_norm_genecounts_survival_coxph |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: Tables/TCGA_GTEx/m3C_writers_TCGA_norm_genecounts_survival_coxph_2024-07-29.tsv
## # A tibble: 116 × 8
## # Groups:   gene_name, primary_site_modified [116]
##    gene_name primary_site_modified term     estimate std.error statistic p.value
##    <chr>     <chr>                 <chr>       <dbl>     <dbl>     <dbl>   <dbl>
##  1 METTL8    Colon                 grouplow  0.00905     0.222    0.0407 9.68e-1
##  2 METTL8    Kidney                grouplow -0.589       0.128   -4.61   3.94e-6
##  3 METTL8    Uterus                grouplow  0.764       0.351    2.18   2.96e-2
##  4 METTL8    Stomach               grouplow -0.185       0.155   -1.20   2.32e-1
##  5 METTL8    Breast                grouplow  0.112       0.143    0.785  4.32e-1
##  6 METTL8    Bile duct             grouplow  0.122       0.427    0.285  7.75e-1
##  7 METTL8    Brain                 grouplow -0.418       0.125   -3.35   8.23e-4
##  8 METTL8    Ovary                 grouplow  0.335       0.124    2.71   6.71e-3
##  9 METTL8    Prostate              grouplow  0.343       0.648    0.530  5.96e-1
## 10 METTL8    Soft tissue,Bone      grouplow  0.538       0.205    2.63   8.62e-3
## # ℹ 106 more rows
## # ℹ 1 more variable: group <chr>
m3C_prognosis_in_tumors_heatmap <- 
  m3C_writers_TCGA_norm_genecounts_survival_coxph |> 
  filter(gene_name != 'ALKBH1') |> 
  left_join(m3C_writers_TCGAtargetGTEx_rel_expression_normcount_summary) |>
  filter(!is.na(mean)) |>
  ggplot(aes(
    x = gene_name |> str_remove('METTL'),
    y = reorder(primary_site_modified, mean),
    fill = group
  )) +
  geom_tile() +
  scale_fill_manual(values = c('blue', 'gray', 'red')) +
  labs(x = '', y = '') +
  theme_minimal(base_size = 8) +
  theme(
    legend.position = 'bottom',
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
## Joining with `by = join_by(gene_name, primary_site_modified)`
m3C_prognosis_in_tumors_heatmap |> 
  ggsave_multiple_formats(
    outdir = fig_survival, 
    width = 4.5, height = 9, units = 'cm', fontsize = 7
  )

#figdir_survival <- 'Figures/TCGA/Survival/Temp/'

plot_KM('METTL2A', m3C_writers_TCGA_norm_genecounts_survival, 'Pancreas')
## [1] "METTL2A"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      91    91      91     58  875.9087  107.4757    532     470
## group=low       92    92      92     37 1375.0710  164.4187    695     592
##            0.95UCL
## group=high     684
## group=low       NA

plot_KM('METTL2A', m3C_writers_TCGA_norm_genecounts_survival, 'Uterus')
## [1] "METTL2A"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      28    28      28     11 2356.7010  397.0849   3115     714
## group=low       29    29      29     24  710.7536  125.3212    541     418
##            0.95UCL
## group=high      NA
## group=low      911

plot_KM('METTL2A', m3C_writers_TCGA_norm_genecounts_survival, 'Ovary')
## [1] "METTL2A"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     212   212     212    133 1728.303  119.3594   1348    1199
## group=low      213   213     213    132 1791.977  131.1958   1355    1213
##            0.95UCL
## group=high    1492
## group=low     1650

m3C_writers <- c('METTL2A', 'METTL2B', 'METTL6', 'METTL8')

m3C_writers |>
  walk(
    plot_KM,
    df = m3C_writers_TCGA_norm_genecounts_survival,
    category = 'Pancreas'
  )
## [1] "METTL2A"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      91    91      91     58  875.9087  107.4757    532     470
## group=low       92    92      92     37 1375.0710  164.4187    695     592
##            0.95UCL
## group=high     684
## group=low       NA
## [1] "METTL2B"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      91    91      91     54  958.7456  124.0640    518     466
## group=low       92    92      92     41 1198.6804  152.4417    627     592
##            0.95UCL
## group=high     732
## group=low       NA
## [1] "METTL6"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      91    91      91     53  913.1053  132.0717    518     460
## group=low       92    92      92     42 1186.0440  143.4653    666     598
##            0.95UCL
## group=high     691
## group=low       NA
## [1] "METTL8"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      91    91      91     50  967.4709  123.8576    593     498
## group=low       92    92      92     45 1206.2863  144.5124    661     545
##            0.95UCL
## group=high     732
## group=low     1130
m3C_writers |>
  walk(
    plot_KM,
    df = m3C_writers_TCGA_norm_genecounts_survival,
    category = 'Uterus'
  )
## [1] "METTL2A"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      28    28      28     11 2356.7010  397.0849   3115     714
## group=low       29    29      29     24  710.7536  125.3212    541     418
##            0.95UCL
## group=high      NA
## group=low      911
## [1] "METTL2B"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high      28    28      28     15 1969.475  385.2990    838     550
## group=low       29    29      29     20 1196.072  220.4269    771     522
##            0.95UCL
## group=high      NA
## group=low       NA
## [1] "METTL6"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high      28    28      28     15 1769.479  372.0517   1526     714
## group=low       29    29      29     20 1102.441  225.2465    667     522
##            0.95UCL
## group=high      NA
## group=low       NA
## [1] "METTL8"
##            records n.max n.start events     rmean se(rmean) median 0.95LCL
## group=high      28    28      28     15 1849.7537  348.2840   1591     714
## group=low       29    29      29     20  986.3187  199.6186    611     378
##            0.95UCL
## group=high      NA
## group=low       NA
m3C_writers |>
  walk(
    plot_KM,
    df = m3C_writers_TCGA_norm_genecounts_survival,
    category = 'Ovary'
  )
## [1] "METTL2A"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     212   212     212    133 1728.303  119.3594   1348    1199
## group=low      213   213     213    132 1791.977  131.1958   1355    1213
##            0.95UCL
## group=high    1492
## group=low     1650
## [1] "METTL2B"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     211   211     211    137 1745.609  124.6838   1359    1249
## group=low      214   214     214    128 1788.410  127.9364   1336    1155
##            0.95UCL
## group=high    1492
## group=low     1699
## [1] "METTL6"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     213   213     213    127 1883.474 143.47617   1348    1189
## group=low      212   212     212    138 1656.316  99.64171   1369    1213
##            0.95UCL
## group=high    1492
## group=low     1583
## [1] "METTL8"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     212   212     212    126 2021.881 141.54335   1492    1359
## group=low      213   213     213    139 1512.350  96.29616   1247    1088
##            0.95UCL
## group=high    1725
## group=low     1369
m3C_writers |>
  walk(
    plot_KM,
    df = m3C_writers_TCGA_norm_genecounts_survival,
    category = 'Lung'
  )
## [1] "METTL2A"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     553   553     553    221 2212.573  174.0980   1485    1338
## group=low      554   554     554    230 2612.685  210.8861   1655    1258
##            0.95UCL
## group=high    1736
## group=low     1975
## [1] "METTL2B"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     555   555     555    209 2649.204  231.9337   1499    1315
## group=low      552   552     552    242 2069.752  117.0246   1531    1258
##            0.95UCL
## group=high    1798
## group=low     1912
## [1] "METTL6"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     555   555     555    231 2277.850  197.3345   1423    1229
## group=low      552   552     552    220 2515.205  204.2985   1679    1379
##            0.95UCL
## group=high    1713
## group=low     2160
## [1] "METTL8"
##            records n.max n.start events   rmean se(rmean) median 0.95LCL
## group=high     552   552     552    234 2252.22  187.8763   1499    1288
## group=low      555   555     555    217 2561.99  220.5698   1528    1344
##            0.95UCL
## group=high    1933
## group=low     1790
m3C_writers |>
  walk(
    plot_KM,
    df = m3C_writers_TCGA_norm_genecounts_survival,
    category = 'Stomach'
  )
## [1] "METTL2A"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     220   220     220     83 1467.923  181.1143    940     762
## group=low      222   222     222     84 1864.486  163.4992   1095     652
##            0.95UCL
## group=high    2197
## group=low       NA
## [1] "METTL2B"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     220   220     220     82 1528.919  198.9949   1043     766
## group=low      222   222     222     85 1764.579  177.2954    881     652
##            0.95UCL
## group=high      NA
## group=low       NA
## [1] "METTL6"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     222   222     222     91 1570.857  187.5039    779     640
## group=low      220   220     220     76 1763.544  181.3018   1294     869
##            0.95UCL
## group=high      NA
## group=low       NA
## [1] "METTL8"
##            records n.max n.start events    rmean se(rmean) median 0.95LCL
## group=high     220   220     220     84 1449.755  196.2948    940     675
## group=low      222   222     222     83 1796.860  186.3028   1095     779
##            0.95UCL
## group=high      NA
## group=low       NA

Expression in tumor and normal

#

m3C_writers_TCGAtargetGTEx_norm_genecounts_Pancreas <- 
  m3C_writers_TCGAtargetGTEx_norm_genecounts |> 
  filter(primary_site_modified == 'Pancreas')
m3C_writers_TCGAtargetGTEx_norm_genecounts_Pancreas |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: Tables/TCGA_GTEx/m3C_writers_TCGAtargetGTEx_norm_genecounts_Pancreas_2024-07-29.tsv
## # A tibble: 1,400 × 12
##    gene_id  gene_name sample norm_count detailed_category primary disease or t…¹
##    <chr>    <chr>     <chr>       <dbl> <chr>             <chr>                 
##  1 ENSG000… METTL8    TCGA-…      10.6  Pancreatic Adeno… Pancreatic Adenocarci…
##  2 ENSG000… METTL8    TCGA-…       9.54 Pancreatic Adeno… Pancreatic Adenocarci…
##  3 ENSG000… METTL8    TCGA-…      10.7  Pancreatic Adeno… Pancreatic Adenocarci…
##  4 ENSG000… METTL8    TCGA-…       9.30 Pancreatic Adeno… Pancreatic Adenocarci…
##  5 ENSG000… METTL8    TCGA-…       9.43 Pancreatic Adeno… Pancreatic Adenocarci…
##  6 ENSG000… METTL8    TCGA-…      10.7  Pancreatic Adeno… Pancreatic Adenocarci…
##  7 ENSG000… METTL8    TCGA-…       9.72 Pancreatic Adeno… Pancreatic Adenocarci…
##  8 ENSG000… METTL8    TCGA-…      10.0  Pancreatic Adeno… Pancreatic Adenocarci…
##  9 ENSG000… METTL8    TCGA-…       9.43 Pancreatic Adeno… Pancreatic Adenocarci…
## 10 ENSG000… METTL8    TCGA-…       9.58 Pancreatic Adeno… Pancreatic Adenocarci…
## # ℹ 1,390 more rows
## # ℹ abbreviated name: ¹​`primary disease or tissue`
## # ℹ 6 more variables: primary_site <chr>, `_sample_type` <chr>,
## #   `_gender` <chr>, `_study` <chr>, sample_type <chr>,
## #   primary_site_modified <chr>
m3C_writers_TCGAtargetGTEx_norm_genecounts |> 
  filter(primary_site_modified == 'Pancreas') |> 
  group_by(gene_name) |> 
  rstatix::wilcox_test(norm_count ~ sample_type)
## # A tibble: 4 × 8
##   gene_name .y.        group1 group2    n1    n2 statistic        p
## * <chr>     <chr>      <chr>  <chr>  <int> <int>     <dbl>    <dbl>
## 1 METTL2A   norm_count Normal Tumor    171   179      183  1.74e-57
## 2 METTL2B   norm_count Normal Tumor    171   179      183  1.74e-57
## 3 METTL6    norm_count Normal Tumor    171   179      402. 6.90e-56
## 4 METTL8    norm_count Normal Tumor    171   179      674  6.29e-54
m3C_writers_expression_pancreas_violin <- 
  m3C_writers_TCGAtargetGTEx_norm_genecounts |> 
  filter(primary_site_modified == 'Pancreas') |> 
  ggplot(aes(
    x = sample_type |> str_remove('ormal|umor'), 
    y = norm_count, 
    fill = sample_type, 
    #colour = sample_type
  )) +
  geom_violin(lwd = .1) +
  geom_boxplot(width = .1, fill = 'white', coef = Inf, lwd = .1) +
  #ggforce::geom_sina(size = .2) +
  facet_wrap( ~ gene_name, scales = 'free_x', nrow = 1) +
  #scale_x_discrete(guide = ggh4x::guide_axis_nested(delim = '.') ) +
  scale_y_continuous(limits = c(0, 13)) +
  scale_fill_manual(values = c('blue', 'red')) +
  labs(x = '', y = 'log2 (normalized count + 1)') +
  #coord_flip() +
  theme_classic(base_size = 7) +
  theme(
    legend.position = 'bottom',
  #  axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1)
  )
m3C_writers_expression_pancreas_violin |> 
  ggsave_multiple_formats(
    outdir = fig_expression,
    width = 7, height = 4, fontsize = 7
  )

Sesison info

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.2.2 (2022-10-31)
##  os       macOS 14.5
##  system   aarch64, darwin20
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Asia/Tokyo
##  date     2024-07-29
##  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  ! package      * version    date (UTC) lib source
##    abind          1.4-5      2016-07-21 [1] CRAN (R 4.2.0)
##    backports      1.5.0      2024-05-23 [1] CRAN (R 4.2.3)
##    bit            4.0.5      2022-11-15 [1] CRAN (R 4.2.0)
##    bit64          4.0.5      2020-08-30 [1] CRAN (R 4.2.0)
##    broom          1.0.6      2024-05-17 [1] CRAN (R 4.2.3)
##    bslib          0.7.0      2024-03-29 [1] CRAN (R 4.2.3)
##    cachem         1.1.0      2024-05-16 [1] CRAN (R 4.2.3)
##    car            3.1-2      2023-03-30 [1] CRAN (R 4.2.0)
##    carData        3.0-5      2022-01-06 [1] CRAN (R 4.2.0)
##    class          7.3-22     2023-05-03 [1] CRAN (R 4.2.0)
##    cli            3.6.2      2023-12-11 [1] CRAN (R 4.2.3)
##    codetools      0.2-20     2024-03-31 [1] CRAN (R 4.2.3)
##    colorspace     2.1-0      2023-01-23 [1] CRAN (R 4.2.0)
##    crayon         1.5.2      2022-09-29 [1] CRAN (R 4.2.0)
##    curl           5.2.1      2024-03-01 [1] CRAN (R 4.2.3)
##    data.table     1.15.4     2024-03-30 [1] CRAN (R 4.2.3)
##    desc           1.4.3      2023-12-10 [1] CRAN (R 4.2.3)
##    devtools       2.4.5      2022-10-11 [1] CRAN (R 4.2.0)
##    dials          1.2.1      2024-02-22 [1] CRAN (R 4.2.3)
##    DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.2.3)
##    digest         0.6.35     2024-03-11 [1] CRAN (R 4.2.3)
##    dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.2.3)
##    ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.2.0)
##    evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.2.2)
##    fansi          1.0.6      2023-12-08 [1] CRAN (R 4.2.3)
##    farver         2.1.2      2024-05-13 [1] CRAN (R 4.2.3)
##    fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.2.3)
##    forcats      * 1.0.0      2023-01-29 [1] CRAN (R 4.2.0)
##    foreach        1.5.2      2022-02-02 [1] CRAN (R 4.2.0)
##    fs             1.6.4      2024-04-25 [1] CRAN (R 4.2.3)
##    furrr          0.3.1      2022-08-15 [1] CRAN (R 4.2.0)
##    future         1.33.2     2024-03-26 [1] CRAN (R 4.2.3)
##    future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.2.3)
##    generics       0.1.3      2022-07-05 [1] CRAN (R 4.2.0)
##    ggforce        0.4.2      2024-02-19 [1] CRAN (R 4.2.3)
##    ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.2.3)
##    ggpubr       * 0.6.0      2023-02-10 [1] CRAN (R 4.2.0)
##    ggrepel        0.9.5      2024-01-10 [1] CRAN (R 4.2.3)
##    ggsignif       0.6.4      2022-10-13 [1] CRAN (R 4.2.0)
##    globals        0.16.3     2024-03-08 [1] CRAN (R 4.2.3)
##    glue           1.7.0      2024-01-09 [1] CRAN (R 4.2.3)
##    gower          1.0.1      2022-12-22 [1] CRAN (R 4.2.0)
##    GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.2.0)
##    gprofiler2     0.2.3      2024-02-23 [1] CRAN (R 4.2.3)
##    gridExtra      2.3        2017-09-09 [1] CRAN (R 4.2.0)
##    gtable         0.3.5      2024-04-22 [1] CRAN (R 4.2.3)
##    hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.2.2)
##    highr          0.11       2024-05-26 [1] CRAN (R 4.2.3)
##    hms            1.1.3      2023-03-21 [1] CRAN (R 4.2.0)
##    htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.2.3)
##    htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.2.3)
##    httpuv         1.6.15     2024-03-26 [1] CRAN (R 4.2.3)
##    httr           1.4.7      2023-08-15 [1] CRAN (R 4.2.0)
##    ipred          0.9-14     2023-03-09 [1] CRAN (R 4.2.0)
##    iterators      1.0.14     2022-02-05 [1] CRAN (R 4.2.0)
##    jquerylib      0.1.4      2021-04-26 [1] CRAN (R 4.2.0)
##    jsonlite       1.8.8      2023-12-04 [1] CRAN (R 4.2.3)
##    km.ci          0.5-6      2022-04-06 [1] CRAN (R 4.2.0)
##    KMsurv         0.1-5      2012-12-03 [1] CRAN (R 4.2.0)
##    knitr          1.47       2024-05-29 [1] CRAN (R 4.2.3)
##    labeling       0.4.3      2023-08-29 [1] CRAN (R 4.2.0)
##    later          1.3.2      2023-12-06 [1] CRAN (R 4.2.3)
##    lattice        0.22-6     2024-03-20 [1] CRAN (R 4.2.3)
##    lava           1.8.0      2024-03-05 [1] CRAN (R 4.2.3)
##    lazyeval       0.2.2      2019-03-15 [1] CRAN (R 4.2.0)
##    lhs            1.1.6      2022-12-17 [1] CRAN (R 4.2.0)
##    lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.2.3)
##    listenv        0.9.1      2024-01-29 [1] CRAN (R 4.2.3)
##    lubridate    * 1.9.3      2023-09-27 [1] CRAN (R 4.2.0)
##    magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.2.0)
##    MASS           7.3-60.0.1 2024-01-13 [1] CRAN (R 4.2.3)
##    Matrix         1.6-5      2024-01-11 [1] CRAN (R 4.2.3)
##    memoise        2.0.1      2021-11-26 [1] CRAN (R 4.2.0)
##    mime           0.12       2021-09-28 [1] CRAN (R 4.2.0)
##    miniUI         0.1.1.1    2018-05-18 [1] CRAN (R 4.2.0)
##    munsell        0.5.1      2024-04-01 [1] CRAN (R 4.2.3)
##  R myUtilities  * 0.0.0.9000 <NA>       [?] <NA>
##    nnet           7.3-19     2023-05-03 [1] CRAN (R 4.2.0)
##    parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.2.3)
##    parsnip        1.2.1      2024-03-22 [1] CRAN (R 4.2.3)
##    pillar         1.9.0      2023-03-22 [1] CRAN (R 4.2.0)
##    pkgbuild       1.4.4      2024-03-17 [1] CRAN (R 4.2.3)
##    pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.2.0)
##    pkgload        1.3.4      2024-01-16 [1] CRAN (R 4.2.3)
##    plotly         4.10.4     2024-01-13 [1] CRAN (R 4.2.3)
##    polyclip       1.10-6     2023-09-27 [1] CRAN (R 4.2.0)
##    prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.2.0)
##    profvis        0.3.8      2023-05-02 [1] CRAN (R 4.2.0)
##    promises       1.3.0      2024-04-05 [1] CRAN (R 4.2.3)
##    purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.2.0)
##    R6             2.5.1      2021-08-19 [1] CRAN (R 4.2.0)
##    ragg           1.3.2      2024-05-15 [1] CRAN (R 4.2.3)
##    Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.2.3)
##    readr        * 2.1.5      2024-01-10 [1] CRAN (R 4.2.3)
##    recipes        1.0.10     2024-02-18 [1] CRAN (R 4.2.3)
##    remotes        2.5.0      2024-03-17 [1] CRAN (R 4.2.3)
##    rlang          1.1.4      2024-06-04 [1] CRAN (R 4.2.2)
##    rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.2.3)
##    rpart          4.1.23     2023-12-05 [1] CRAN (R 4.2.3)
##    rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.2.0)
##    rsample        1.2.1      2024-03-25 [1] CRAN (R 4.2.3)
##    rstatix        0.7.2      2023-02-01 [1] CRAN (R 4.2.0)
##    rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.2.3)
##    sass           0.4.9      2024-03-15 [1] CRAN (R 4.2.3)
##    scales         1.3.0      2023-11-28 [1] CRAN (R 4.2.3)
##    sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.2.0)
##    shiny          1.8.1.1    2024-04-02 [1] CRAN (R 4.2.3)
##    stringi        1.8.4      2024-05-06 [1] CRAN (R 4.2.3)
##    stringr      * 1.5.1      2023-11-14 [1] CRAN (R 4.2.3)
##    survival     * 3.7-0      2024-06-05 [1] CRAN (R 4.2.2)
##    survminer    * 0.4.9.999  2022-11-21 [1] Github (kassambara/survminer@5ce1833)
##    survMisc       0.5.6      2022-04-07 [1] CRAN (R 4.2.0)
##    svglite        2.1.3      2023-12-08 [1] CRAN (R 4.2.3)
##    systemfonts    1.1.0      2024-05-15 [1] CRAN (R 4.2.3)
##    textshaping    0.4.0      2024-05-24 [1] CRAN (R 4.2.3)
##    tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
##    tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.2.3)
##    tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.2.3)
##    tidyverse    * 2.0.0      2023-02-22 [1] CRAN (R 4.2.0)
##    timechange     0.3.0      2024-01-18 [1] CRAN (R 4.2.3)
##    timeDate       4032.109   2023-12-14 [1] CRAN (R 4.2.3)
##    tune           1.2.1      2024-04-18 [1] CRAN (R 4.2.3)
##    tweenr         2.0.3      2024-02-26 [1] CRAN (R 4.2.3)
##    tzdb           0.4.0      2023-05-12 [1] CRAN (R 4.2.0)
##    urlchecker     1.0.1      2021-11-30 [1] CRAN (R 4.2.0)
##    usethis        2.2.3      2024-02-19 [1] CRAN (R 4.2.3)
##    utf8           1.2.4      2023-10-22 [1] CRAN (R 4.2.0)
##    vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.2.3)
##    viridisLite    0.4.2      2023-05-02 [1] CRAN (R 4.2.0)
##    vroom          1.6.5      2023-12-05 [1] CRAN (R 4.2.3)
##    withr          3.0.0      2024-01-16 [1] CRAN (R 4.2.3)
##    workflows      1.1.4      2024-02-19 [1] CRAN (R 4.2.3)
##    xfun           0.44       2024-05-15 [1] CRAN (R 4.2.3)
##    xtable         1.8-4      2019-04-21 [1] CRAN (R 4.2.0)
##    yaml           2.3.8      2023-12-11 [1] CRAN (R 4.2.3)
##    yardstick      1.3.1      2024-03-21 [1] CRAN (R 4.2.3)
##    zoo            1.8-12     2023-04-13 [1] CRAN (R 4.2.0)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## 
##  R ── Package was removed from disk.
## 
## ──────────────────────────────────────────────────────────────────────────────